TL;DR

Applying our zero-inflated model to the olfactory data.

Unnormalized data

We select only the cells that pass QC, color coded by the experimental time point. Note that the data are not public, hence it should not work other than on Davide’s computer.

I think Kelly said the data are now publicly available. How do we get the publicly available dataset?

load("../data/Expt4c2b_filtdata.Rda")
load("../data/E4c2b_slingshot_wsforkelly.RData")
names(batch) <- colnames(counts)

counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]

vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]

We have 616 cells. To speed up the computations, we will focus on the top 1,000 most variable genes.

dim(core)
## [1] 1000  616
core[1:3, 1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1             8763            7221            3581
## Cbr2              5799            3638            1448
## Cyp2f2            2158            2027            1078

Cells have been processed in 18 different batches

table(batch)
## batch
## GBC08A GBC08B GBC09A GBC09B    P01    P02   P03A   P03B    P04    P05 
##     36     33     30     17     25     43     37     33     14     20 
##    P06    P10    P11    P12    P13    P14    Y01    Y04 
##     39     36     44     40     47     39     51     32

Cells have been clustered into 13 different clusters

table(clus.labels)
## clus.labels
##  1  2  3  4  5  7  8  9 10 11 12 14 15 
## 91 25 56 40 96 60 28 79 26 22 35 26 32

Batches are kind of confounded with the biology

table(data.frame(batch = as.vector(batch),
                 cluster = clus.labels))
##         cluster
## batch     1  2  3  4  5  7  8  9 10 11 12 14 15
##   GBC08A  0  2 12  9  0  0  0  0  0  2  0  2  9
##   GBC08B  0  7  5  3  0  0  0  1  2  4  0  5  6
##   GBC09A  0  1  5  9  0  0  0  1  1  0  0  6  7
##   GBC09B  0  2  2  7  0  0  0  3  0  0  0  3  0
##   P01     0  2  4  3 15  1  0  0  0  0  0  0  0
##   P02     2  0  9  3 15  3  3  2  3  0  2  1  0
##   P03A    3  0  3  0 12  2  9  4  2  0  2  0  0
##   P03B    1  2  1  1 11  1  2 10  1  1  2  0  0
##   P04     0  0  0  0 11  1  0  1  1  0  0  0  0
##   P05     0  0  0  1 11  3  0  1  0  2  2  0  0
##   P06     1  2  3  0  8  2  4  8  4  1  2  2  2
##   P10     3  1  4  0  4  5  9  2  0  2  5  0  1
##   P11     2  1  1  0  1  5  1 22  3  1  6  0  1
##   P12     0  2  0  0  4 10  0  8  2  3  6  4  1
##   P13     1  2  4  0  4 15  0  4  5  6  1  3  2
##   P14     0  0  1  2  0 12  0 12  2  0  7  0  3
##   Y01    47  1  1  2  0  0  0  0  0  0  0  0  0
##   Y04    31  0  1  0  0  0  0  0  0  0  0  0  0

Could you remind me where the clusters come from?

For each cell, we dispose of qc measures

head(qc, 2)
##                  NREADS NALIGNED  RALIGN TOTAL_DUP    PRIMER
## OEP01_N706_S501 3313260  3167600 95.6035   47.9943 0.0154566
## OEP01_N701_S501 2902430  2757790 95.0167   45.0150 0.0182066
##                 PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES
## OEP01_N706_S501               2e-06         0.200130      0.230654
## OEP01_N701_S501               0e+00         0.182461      0.201810
##                 PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES
## OEP01_N706_S501           0.404205             0.165009       0.430784
## OEP01_N701_S501           0.465702             0.150027       0.384271
##                 MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS
## OEP01_N706_S501           0.843857           0.061028           0.521079
## OEP01_N701_S501           0.914370           0.033350           0.373993

PCA

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(1, 2))
detection_rate <- colSums(core>0)
coverage <- colSums(core)
colMerged <- cc_rev[clus.labels]

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")

fq <- EDASeq::betweenLaneNormalization(counts, which="full")
pcafq <- prcomp(t(log1p(fq)))
plot(pcafq$x, col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")

The first plot is the unnormalized data and the second plot is after full-quantile normalization, which is what Russell and Diya used for the paper.

Do we have a reference for this paper?

Is it a fair comparison?? for pca on log counts, we use the 1000 most variable genes, but for full quantile normalization, we use all the genes. I understand that to perform full quantile normalization we need all the genes, but then should we look at only the 1000 most variable genes defined above, no?

They found that to fully explain the differences between clusters, we need three dimensions.

pairs(pcafq$x[,1:3], col=colMerged, pch=19,
      main="PCA of FQ log-counts, centered not scaled")

df <- data.frame(PC1 = pcafq$x[, 1],
                 PC2 = pcafq$x[, 2],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch = 19, col = colMerged)

print(cor(df, method = "spearman"))
##                        PC1        PC2 detection_rate    coverage
## PC1             1.00000000 -0.0432585      0.2643194 -0.02214266
## PC2            -0.04325850  1.0000000      0.6508453  0.19979644
## detection_rate  0.26431940  0.6508453      1.0000000  0.41994811
## coverage       -0.02214266  0.1997964      0.4199481  1.00000000

Even after full-quantile normalization, there is some residual correlation between PC2 and detection rate. Note that, as expected, detection rate and coverage are correlated.

totalcount = function (ei) {
  sums = colSums(ei)
  eo = t(t(ei)*mean(sums)/sums)
  return(eo)
}
raw <- as.matrix(counts)
tc <- totalcount(raw)
fq <- FQT_FN(raw)
tmm <- TMM_FN(raw)

fastpca <- function(expr, scale=FALSE) {
  svd_raw <- svds(scale(t(expr), center=TRUE, scale=scale), k=3, nu=3, nv=0)
  pc_raw <- svd_raw$u %*% diag(svd_raw$d[1:3])
  return(pc_raw)
}
vargenes <- rownames(core)
pc_raw <- fastpca(log1p(raw[vargenes,]))
pc_tc <- fastpca(log1p(tc[vargenes,]))
pc_fq <- fastpca(log1p(fq[vargenes,]))
pc_tmm <- fastpca(log1p(tmm[vargenes,]))

ZIFA

Let’s now look at zifa.

wrapRzifa <- function(Y, block = TRUE, k=2){
  # wrapper R function for ZIFA.
  # md5 hashing and temporary files are used not to re-run zifa 
  # if it has already be run on this computer.
  d = digest(Y, "md5")
  tmp = paste0(tempdir(), '/', d)
  write.csv(Y, paste0(tmp, '.csv'))
  
  if (!file.exists(paste0(tmp, "_", k, '_zifa.csv'))){
    print('run ZIFA')
    bb = ifelse(block, '-b ', '')
    cmd = sprintf('python run_zifa.py -d %d %s%s.csv %s_%d_zifa.csv', k, bb, tmp, tmp, k)
    system(cmd)
  }
  read.csv(sprintf("%s_%d_zifa.csv", tmp, k), header=FALSE)
}

zifa_raw <- wrapRzifa(log1p(raw[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_tc <- wrapRzifa(log1p(tc[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_tmm <- wrapRzifa(log1p(tmm[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_fq <- wrapRzifa(log1p(fq[vargenes,]), k=3)
## [1] "run ZIFA"
pairs(zifa_raw[,1:3], col=colMerged, pch=19,
      main="ZIFA of raw log-counts")

pairs(zifa_fq[,1:3], col=colMerged, pch=19,
      main="ZIFA of FQ log-counts")

ZINB

Three dimensions

print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3)))
##    user  system elapsed 
## 220.663  43.479 111.175
pairs(zinb_3@W, col = colMerged, pch = 19, main = "ZINB")

qcpca <- prcomp(qc, center = TRUE, scale. = TRUE)
df <- data.frame(W1 = zinb_3@W[,1],
                 W2 = zinb_3@W[,2], 
                 W3 = zinb_3@W[,3],
                 gamma_mu = zinb_3@gamma_mu[1,],
                 gamma_pi = zinb_3@gamma_pi[1,],
                 detection_rate=detection_rate,
                 coverage = coverage, 
                 quality = qcpca$x[,1])
pairs(df, pch = 19, col = colMerged)

print(cor(df, method = "spearman"))
##                         W1          W2          W3    gamma_mu    gamma_pi
## W1              1.00000000 -0.02414526 -0.09064496  0.08205038  0.15669401
## W2             -0.02414526  1.00000000 -0.01166085 -0.03726806  0.20835852
## W3             -0.09064496 -0.01166085  1.00000000 -0.13109039 -0.01740836
## gamma_mu        0.08205038 -0.03726806 -0.13109039  1.00000000 -0.07958898
## gamma_pi        0.15669401  0.20835852 -0.01740836 -0.07958898  1.00000000
## detection_rate -0.15316526 -0.32159255  0.01930138  0.20659231 -0.97309429
## coverage        0.07916631  0.01772106  0.04078241  0.89501409 -0.32085525
## quality         0.05742306  0.11847496 -0.05564158 -0.55196048  0.30254576
##                detection_rate    coverage     quality
## W1                -0.15316526  0.07916631  0.05742306
## W2                -0.32159255  0.01772106  0.11847496
## W3                 0.01930138  0.04078241 -0.05564158
## gamma_mu           0.20659231  0.89501409 -0.55196048
## gamma_pi          -0.97309429 -0.32085525  0.30254576
## detection_rate     1.00000000  0.41994811 -0.34747756
## coverage           0.41994811  1.00000000 -0.60944735
## quality           -0.34747756 -0.60944735  1.00000000

Five dimensions

print(system.time(zinb_5 <- zinbFit(core, ncores = 3, K = 5)))
##    user  system elapsed 
## 274.465  42.470 131.200
pairs(zinb_5@W, col = colMerged, pch = 19, main = "ZINB")

Ten dimensions

print(system.time(zinb_10 <- zinbFit(core, ncores = 3, K = 10)))
##    user  system elapsed 
## 482.195  59.792 212.380
pairs(zinb_10@W, col = colMerged, pch = 19, main = "ZINB")

Accounting for quality

mod <- model.matrix(~ qcpca$x[, 1:5])
print(system.time(zinb_3_qc <- zinbFit(core, ncores = 3, K = 3, X = mod)))
##    user  system elapsed 
## 363.469  46.409 159.956
pairs(zinb_3_qc@W, col=colMerged, pch=19, main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1 = zinb_3_qc@W[,1],
                 W2 = zinb_3_qc@W[,2],
                 W3 = zinb_3_qc@W[,3],
                 gamma_mu = zinb_3_qc@gamma_mu[1,],
                 gamma_pi = zinb_3_qc@gamma_pi[1,],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch = 19, col = colMerged)

print(cor(df, method = "spearman"))
##                         W1          W2           W3     gamma_mu
## W1              1.00000000 -0.03466116 -0.021067844  0.154529488
## W2             -0.03466116  1.00000000 -0.021384549 -0.064591459
## W3             -0.02106784 -0.02138455  1.000000000  0.006667748
## gamma_mu        0.15452949 -0.06459146  0.006667748  1.000000000
## gamma_pi       -0.02937107 -0.04003005  0.127357641 -0.281894727
## detection_rate  0.06250448  0.12131856 -0.160817375  0.466624520
## coverage        0.02182683 -0.10622722 -0.098061523  0.868247711
##                   gamma_pi detection_rate    coverage
## W1             -0.02937107     0.06250448  0.02182683
## W2             -0.04003005     0.12131856 -0.10622722
## W3              0.12735764    -0.16081737 -0.09806152
## gamma_mu       -0.28189473     0.46662452  0.86824771
## gamma_pi        1.00000000    -0.95438636 -0.26924454
## detection_rate -0.95438636     1.00000000  0.41994811
## coverage       -0.26924454     0.41994811  1.00000000

K = 5

print(system.time(zinb_5_qc <- zinbFit(core, ncores = 3, K = 5, X = mod)))
##    user  system elapsed 
## 418.028  49.259 192.562
pairs(zinb_5_qc@W, col=colMerged, pch=19, main="ZINB")

K = 10

print(system.time(zinb_10_qc <- zinbFit(core, ncores = 3, K = 10, X = mod)))
##    user  system elapsed 
## 644.981  67.262 278.458
pairs(zinb_10_qc@W, col=colMerged, pch=19, main="ZINB")

Accounting for batch

class(batch)
## [1] "factor"
mod <- model.matrix(~ batch)
dim(mod)
## [1] 616  18
print(system.time(zinb_3_batch <- zinbFit(core, ncores = 3, K = 3, X = mod)))
##    user  system elapsed 
## 420.864  49.121 178.302
pairs(zinb_3_batch@W, col=colMerged, pch=19, main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1 = zinb_3@W[,1],
                 W2 = zinb_3@W[,2],
                 W3 = zinb_3@W[,3],
                 gamma_mu = zinb_3@gamma_mu[1,],
                 gamma_pi = zinb_3@gamma_pi[1,],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch = 19, col = colMerged)

print(cor(df, method = "spearman"))
##                         W1          W2          W3    gamma_mu    gamma_pi
## W1              1.00000000 -0.02414526 -0.09064496  0.08205038  0.15669401
## W2             -0.02414526  1.00000000 -0.01166085 -0.03726806  0.20835852
## W3             -0.09064496 -0.01166085  1.00000000 -0.13109039 -0.01740836
## gamma_mu        0.08205038 -0.03726806 -0.13109039  1.00000000 -0.07958898
## gamma_pi        0.15669401  0.20835852 -0.01740836 -0.07958898  1.00000000
## detection_rate -0.15316526 -0.32159255  0.01930138  0.20659231 -0.97309429
## coverage        0.07916631  0.01772106  0.04078241  0.89501409 -0.32085525
##                detection_rate    coverage
## W1                -0.15316526  0.07916631
## W2                -0.32159255  0.01772106
## W3                 0.01930138  0.04078241
## gamma_mu           0.20659231  0.89501409
## gamma_pi          -0.97309429 -0.32085525
## detection_rate     1.00000000  0.41994811
## coverage           0.41994811  1.00000000

K = 5

print(system.time(zinb_5_batch <- zinbFit(core, ncores = 3, K = 5, X = mod)))
##    user  system elapsed 
## 501.115  60.472 237.849
pairs(zinb_5_batch@W, col=colMerged, pch=19, main="ZINB")

K = 10

print(system.time(zinb_10_batch <- zinbFit(core, ncores = 3, K = 10, X = mod)))
##    user  system elapsed 
## 764.353 123.040 381.030
pairs(zinb_10_batch@W, col=colMerged, pch=19, main="ZINB")

Accounting for batch and quality

mod <- model.matrix(~ qcpca$x[, 1:5] + batch)
print(system.time(zinb_3_qc_batch <- zinbFit(core, ncores = 3, K = 3, X = mod)))
##    user  system elapsed 
## 593.302  83.211 245.604
pairs(zinb_3_qc_batch@W, col=colMerged, pch=19, main="ZINB")

K = 5

print(system.time(zinb_5_qc_batch <- zinbFit(core, ncores = 3, K = 5, X = mod)))
##    user  system elapsed 
## 646.450 107.370 293.291
pairs(zinb_5_qc_batch@W, col=colMerged, pch=19, main="ZINB")

K = 10

print(system.time(zinb_10_qc_batch <- zinbFit(core, ncores = 3, K = 10, X = mod)))
##    user  system elapsed 
## 939.106 148.088 412.685
pairs(zinb_10_qc_batch@W, col=colMerged, pch=19, main="ZINB")

save(zinb_3, zinb_5, zinb_10,
     zinb_3_qc, zinb_5_qc, zinb_10_qc, 
     zinb_3_batch, zinb_5_batch, zinb_10_batch,
     zinb_3_qc_batch, zinb_5_qc_batch, zinb_10_qc_batch,
     pc_tmm, pc_fq, pc_tc, pc_raw,
     zifa_fq, zifa_tmm, zifa_tc, zifa_raw,
     file="olfactory.rda")